Exploratory Data Analysis (cont’d): - Pairs plot to show correlation between variables and avoid multicollinearity (see 8.2 Many predictors in a model) Logistic Regression seen as an evolution of techniques - Linear Model to show simplest multivariate regression, but predictions can be outside the binary values. - Generalized Linear Model uses a logit transformation to constrain the outputs to being within two values. - Generalized Additive Model allows for “wiggle” in predictor terms. - Maxent (Maximum Entropy) is a presence-only modeling technique that allows for a more complex set of shapes between predictor and response.
librarian::shelf(
DT, dplyr, dismo, GGally, here, readr, tidyr)
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = F)
dir_data <- here("data")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 1552
datatable(pts_env, rownames = F)
GGally to look at pair plots and examine correlations between environmental variablesGGally::ggpairs(
select(pts_env, -ID),
aes(color = factor(present)))
# Logistic Regression ## Setup Data - drop rows of data with any NA values (later, we’ll learn how to impute values) - remove terms we don’t want to model
d <- pts_env %>%
select(-ID) %>% # remove terms we don't want to model
tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 1548
mod <- lm(present ~ ., data = d)
summary(mod)
##
## Call:
## lm(formula = present ~ ., data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.06039 -0.30499 0.06589 0.28673 1.08016
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.668e-01 9.043e-01 -0.627 0.53093
## WC_alt -5.757e-05 1.022e-04 -0.563 0.57327
## WC_bio1 1.650e-01 1.918e-02 8.603 < 2e-16 ***
## WC_bio4 -2.082e-03 2.131e-03 -0.977 0.32876
## WC_bio12 -1.889e-03 2.311e-04 -8.175 6.11e-16 ***
## ER_climaticMoistureIndex 1.622e+00 2.271e-01 7.140 1.43e-12 ***
## ER_tri 4.292e-03 1.405e-03 3.055 0.00229 **
## ER_topoWet 9.895e-03 1.621e-02 0.610 0.54163
## lon 2.432e-02 2.519e-03 9.654 < 2e-16 ***
## lat 8.610e-02 1.472e-02 5.850 6.00e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4042 on 1538 degrees of freedom
## Multiple R-squared: 0.3507, Adjusted R-squared: 0.3469
## F-statistic: 92.31 on 9 and 1538 DF, p-value: < 2.2e-16
Note: a linear model is ineffective because it predicts values outside the 0 - 1 range
y_predict <- predict(mod, pts_env, type="response")
y_true <- pts_env$present
range(y_predict)
## [1] NA NA
range(y_true)
## [1] 0 1
To solve this problem, we will apply a Logit Transformation
# fit a generalized linear model with a binomial logit link function
mod <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mod)
##
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"),
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4482 -0.7997 -0.1347 0.7655 2.5857
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.9805150 6.3935208 -0.466 0.64109
## WC_alt -0.0007000 0.0006809 -1.028 0.30393
## WC_bio1 0.8844692 0.1320449 6.698 2.11e-11 ***
## WC_bio4 0.0060152 0.0134718 0.446 0.65524
## WC_bio12 -0.0112959 0.0015053 -7.504 6.18e-14 ***
## ER_climaticMoistureIndex 10.3772734 1.5333428 6.768 1.31e-11 ***
## ER_tri 0.0287820 0.0093305 3.085 0.00204 **
## ER_topoWet 0.0731181 0.1003126 0.729 0.46606
## lon 0.1188077 0.0160109 7.420 1.17e-13 ***
## lat 0.3553762 0.1109841 3.202 0.00136 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2146.0 on 1547 degrees of freedom
## Residual deviance: 1512.3 on 1538 degrees of freedom
## AIC: 1532.3
##
## Number of Fisher Scoring iterations: 5
Note: we are not within the 0 - 1 range we want to be in
y_predict <- predict(mod, d, type="response")
range(y_predict)
## [1] 0.006823366 0.950054989
termplot(mod, partial.resid = TRUE, se = TRUE, main = F)
We can further improve the GLM by adding “wiggle” to the relationship between the predictor and response variables
librarian::shelf(mgcv)
# fit a generalized additive model with smooth predictors
mod <- mgcv::gam(
formula = present ~ s(WC_alt) +
s(WC_bio1) +
s(WC_bio4) +
s(WC_bio12) +
s(ER_climaticMoistureIndex) +
s(ER_tri) +
s(ER_topoWet) +
s(lon) +
s(lat),
family = binomial, data = d)
summary(mod)
##
## Family: binomial
## Link function: logit
##
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio4) + s(WC_bio12) +
## s(ER_climaticMoistureIndex) + s(ER_tri) + s(ER_topoWet) +
## s(lon) + s(lat)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -74.78 36.89 -2.027 0.0427 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(WC_alt) 2.564 3.201 6.335 0.119962
## s(WC_bio1) 4.207 5.165 23.346 0.000436 ***
## s(WC_bio4) 5.911 7.076 30.629 8.15e-05 ***
## s(WC_bio12) 7.845 8.500 40.908 < 2e-16 ***
## s(ER_climaticMoistureIndex) 7.694 8.129 20.069 0.010477 *
## s(ER_tri) 8.455 8.711 30.665 0.000500 ***
## s(ER_topoWet) 7.710 8.091 15.253 0.075929 .
## s(lon) 8.877 8.982 73.399 < 2e-16 ***
## s(lat) 5.175 6.313 15.520 0.015247 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.541 Deviance explained = 49.8%
## UBRE = -0.22746 Scale est. = 1 n = 1548
plot(mod, scale=0)
This is the most commonly used species distribution model because it requires few input data points, all of which can be presence observation points, and is easy to use with a Java GUI.
# load extra packages
librarian::shelf(
maptools, sf)
# show version of maxent
maxent()
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)
# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>%
sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class
# fit a maximum entropy model
mod <- maxent(env_stack, obs_sp)
## This is MaxEnt version 3.4.3
# plot variable contributions per predictor
plot(mod)
# plot term plots
response(mod)
## Predict
# predict
y_predict <- predict(env_stack, mod) #, ext=ext, progress='')
plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')